============================================================================================
============================================================================================
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
import pandas as pd
import numpy as np
import os
import re
import random
import time
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA
import fns_models as fns
# IPython line magic (not plain Python): selects the inline matplotlib backend.
% matplotlib inline
from subprocess import check_output
# Print the listing of the local `data` directory as returned by `ls`.
print(check_output(["ls", "data"]).decode("utf-8"))
def plot_columns(sample_painting):
    """Show up to eight paintings from ``sample_painting`` in a tick-less grid.

    Parameters
    ----------
    sample_painting : pandas.DataFrame
        Rows to display. Every row must provide ``author_id`` and
        ``painting_id``; the image is read from
        ``data/images_athenaeum/full/<author_id>/<painting_id>.jpg``.
        When more than eight rows are passed, eight are drawn at random
        with ``DataFrame.sample``.

    Returns
    -------
    None
        The function only draws. For an empty frame it prints an info
        message and returns without creating a figure.
    """
    if len(sample_painting) > 8:
        sample_painting = sample_painting.sample(8)
    elif len(sample_painting) == 0:
        print("[INFO]: No painting for this cluster!")
        return

    # Imported after the empty check so the no-op path needs no imaging library.
    from PIL import Image

    size = len(sample_painting)
    # One row for up to four paintings, otherwise two rows.
    n_rows = 1 if size <= 4 else 2
    n_cols = size if n_rows == 1 else (size + 1) // 2

    # squeeze=False always yields a 2-D axes array, so a single indexing rule
    # covers the 1x1, 1xN and 2xN layouts.
    f, ax = plt.subplots(n_rows, n_cols, figsize=(20, 15), squeeze=False)
    for i in range(size):
        row = sample_painting.iloc[i]
        im = Image.open('data/images_athenaeum/full/%d/%d.jpg' % (row['author_id'],
                                                                  row['painting_id']))
        # Floor division keeps the row index an integer on Python 3 as well.
        cur_axis = ax[i // n_cols, i % n_cols]
        cur_axis.imshow(im)
        cur_axis.set_yticks([])
        cur_axis.set_xticks([])

    # An odd count in the two-row layout leaves one grid cell unused; hide it.
    for unused_axis in ax.flat[size:]:
        unused_axis.axis('off')
# Load the test / train id tables and the colour-histogram table.
test_id = pd.read_csv('data/test_hist_author_knn.csv')
train_id = pd.read_csv('data/train_hist_author_knn.csv')
test_id.head(1)
color_hist = pd.read_csv('data/color_hist_kmeans_206552.csv')
color_hist.head(3)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(train_id.shape)
print(test_id.shape)
print(color_hist.shape)
# find the train, test data for the tsne
# Inner-join on painting_id, using only the second column of each id table,
# so only histogram rows whose painting_id appears in that split are kept.
train_tsne = color_hist.merge(pd.DataFrame(train_id.iloc[:, 1]), how='inner', on='painting_id')
test_tsne = color_hist.merge(pd.DataFrame(test_id.iloc[:, 1]), how='inner', on='painting_id')
print(train_tsne.shape)
print(test_tsne.shape)
train_tsne.head(1)
# filtered_out = ['height_px', 'width_px']
# color_hist = color_hist.drop(filtered_out, axis=1)
# NOTE(review): `kmeans` is created by the KMeans fit further down in this
# file, so this step only works after that fit has been executed.
color_hist['kmeans_labels'] = kmeans.labels_
print(color_hist.shape)
# NOTE(review): this is the same path color_hist was read from, so the input
# file is overwritten with the labelled table.
color_hist.to_csv('data/color_hist_kmeans_206552.csv', index=False)
# Divide each row of the selected columns by one third of that row's sum
# (presumably three colour channels per painting -- confirm).
# The builtin float replaces the np.float alias, which newer NumPy removed.
color_hist.iloc[:,2:-2] = color_hist.iloc[:, 2:-2]\
    .apply(lambda x: x.astype(float) / (x.sum()/3), axis = 1, raw = True)
# movement_hist_test.iloc[:,3:-1] = movement_hist_test.iloc[:, 3:-1]\
# .apply(lambda x: x.astype(np.float) / (x.sum()/3), axis = 1, raw = True)
color_hist.head(3)
%%time
# prepare Kmeans data
kmeans = KMeans(n_init = 100, n_jobs=4)
kmeans.set_params(n_clusters=7)
kmeans.fit(color_hist.iloc[:, 2:-1])
print (kmeans.labels_).shape
kmeans.labels_
color_hist.iloc[:, 2:].columns
kmeans.cluster_centers_.shape
# pd.DataFrame(kmeans.cluster_centers_).to_csv('data/kmeans_centers.csv', index=False)
def get_paintings_around_centroid(centroid, color_hist, num_paintings):
    """Return the ids of the ``num_paintings`` rows nearest to ``centroid``.

    Distance is the squared Euclidean distance between ``centroid`` and the
    columns of ``color_hist`` from the third up to (not including) the last.
    The result holds the ``painting_id`` and ``author_id`` columns of the
    selected rows, ordered from nearest to farthest.
    """
    feature_columns = color_hist.iloc[:, 2:-1]

    def squared_distance(row):
        # `row` arrives as a raw array because apply() is called with raw=True.
        return sum((row - centroid) ** 2)

    distances = feature_columns.apply(squared_distance, raw=True, axis=1)
    nearest = distances.sort_values()[:num_paintings]
    return color_hist.loc[nearest.index, ['painting_id', 'author_id']]
def get_paintings_around_centroid(centroid, color_hist, num_paintings):
    """Return the ids of the ``num_paintings`` rows nearest to ``centroid``.

    Parameters
    ----------
    centroid : array-like
        One value per feature column, i.e. per column of ``color_hist`` from
        the third up to (not including) the last.
    color_hist : pandas.DataFrame
        Table containing ``painting_id`` and ``author_id`` plus the feature
        columns described above.
    num_paintings : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``painting_id`` and ``author_id`` columns of the selected rows,
        ordered from nearest to farthest by squared Euclidean distance.
    """
    features = color_hist.iloc[:, 2:-1].values
    # One vectorised expression instead of a Python-level call per row.
    squared = ((features - np.asarray(centroid)) ** 2).sum(axis=1)
    # Re-attach the frame's index so the labels can be used with .loc below.
    distances = pd.Series(squared, index=color_hist.index)
    nearest = distances.nsmallest(num_paintings).index
    return color_hist.loc[nearest, ['painting_id', 'author_id']]
%%time
# Each of the following timed cells plots the 4 paintings nearest to one
# cluster centroid, for centroid indices 0 through 6.
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[0], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[1], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[2], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[3], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[4], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[5], color_hist, 4))
%%time
plot_columns(get_paintings_around_centroid(kmeans.cluster_centers_[6], color_hist, 4))
def plot_columns_kmeans(centroids, color_hist, num_per_cluster, art_movements = None):
    """Draw one row of paintings per centroid, nearest painting first.

    Parameters
    ----------
    centroids : numpy.ndarray
        2-D array with one centroid per row.
    color_hist : pandas.DataFrame
        Table passed through to ``get_paintings_around_centroid``.
    num_per_cluster : int
        Number of paintings (grid columns) to show for each centroid.
    art_movements : pandas.DataFrame, optional
        When given, must contain ``author_id``, ``painting_id`` and
        ``sup_art_movement``; the latter is used as the title of each image.

    Returns
    -------
    None
        The function only draws.
    """
    base_dim = 18
    from PIL import Image

    n_clusters = centroids.shape[0]
    # float() keeps the aspect ratio exact under Python 2 integer division.
    fig_height = base_dim * float(n_clusters) / num_per_cluster
    # squeeze=False always yields a 2-D axes array, so ax[y, i] is valid even
    # with a single centroid or a single painting per cluster.
    f, ax = plt.subplots(n_clusters, num_per_cluster,
                         figsize=(base_dim, fig_height), squeeze=False)
    for y, centroid in enumerate(centroids):
        paintings = get_paintings_around_centroid(centroid, color_hist, num_per_cluster)
        if art_movements is not None:
            paintings = paintings.merge(art_movements[['author_id', 'painting_id', 'sup_art_movement']],
                                        how='left', on=['author_id', 'painting_id'])
        # The merge can add rows if a painting is listed more than once in
        # art_movements; never index past the columns of the grid.
        for i in range(min(len(paintings), num_per_cluster)):
            row = paintings.iloc[i]
            im = Image.open('data/images_athenaeum/full/%d/%d.jpg' % (row['author_id'],
                                                                      row['painting_id']))
            cur_axis = ax[y, i]
            cur_axis.imshow(im)
            cur_axis.set_yticks([])
            cur_axis.set_xticks([])
            if art_movements is not None:
                cur_axis.set_title(row['sup_art_movement'])
            if i == 0:
                # Label each grid row once, on its left-most image.
                cur_axis.set_ylabel('cluster #%d' % y)
color_hist.head(1)
%%time
# Plot the 8 paintings nearest to each centroid; the movement table read here
# supplies the `sup_art_movement` titles.
plot_columns_kmeans(kmeans.cluster_centers_, color_hist, 8, art_movements=pd.read_csv('data/athenaeum_painting_movement.csv'))
# calculate the distance
# NOTE(review): the only line in this file that writes kmeans_centers.csv is
# commented out, so the file must already exist on disk.
kmeans_centers = pd.read_csv('data/kmeans_centers.csv')
kmeans_centers.head(1)
# Shape of the columns from the third up to (not including) the last, for
# rows labelled cluster 0.
color_hist[color_hist['kmeans_labels'] == 0].iloc[:, 2:-1].shape
# Imported before first use; the import used to sit after the first TSNE call.
from sklearn.manifold import TSNE
# Memory error
# NOTE(review): per the remark above, fitting t-SNE on the full table ran out
# of memory; the sampled fit below is the one whose result is kept.
tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
tsne.fit_transform(color_hist.iloc[:, 1:])
# Stack the train and test subsets into a single table for t-SNE.
tsne_data = pd.concat([train_tsne.reset_index(drop=True), test_tsne], axis=0)
print(tsne_data.shape)
tsne_data.head(1)
tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
# Embed a reproducible 5000-row sample (random_state=123) of the columns from
# the third onwards.
tsne_trainsformed = tsne.fit_transform(tsne_data.iloc[:, 2:].sample(5000,random_state = 123))
tsne_data.head(1)
# Use the module form so the name `time` keeps referring to the time module
# imported at the top of the file instead of being rebound to a function.
import time
from sklearn.manifold import TSNE
# Perform t-distributed stochastic neighbor embedding.
t0 = time.time()
tsne = TSNE(n_components=2, init='pca', random_state=2017)
# NOTE(review): this timed embedding uses its own unseeded sample and is not
# the one plotted below (the plot uses `tsne_trainsformed`), so the time in
# the plot title describes this run only.
trans_data = tsne.fit_transform(tsne_data.iloc[:, 2:].sample(5000)).T
t1 = time.time()
print("t-SNE: %.2g sec" % (t1 - t0))
tsne_trainsformed
# Same sample size and random_state as the rows behind `tsne_trainsformed`,
# so each colour value lines up with its embedded point.
colors = tsne_data.iloc[:, -1].sample(5000,random_state = 123)
fig = plt.figure(figsize=(18, 10))
plt.scatter(tsne_trainsformed[:,0], tsne_trainsformed[:,1],
            c=np.array(colors),
            cmap=plt.cm.rainbow)
plt.title("t-SNE (%.2g sec)" % (t1 - t0))
# 'auto' is the supported spelling of the former 'normal' option.
plt.axis('auto')
plt.show()
fig.savefig('data/kmeans.png', dpi=fig.dpi)
# Reproducible 5000-row sample; same size and random_state as the sample used
# for the t-SNE embedding and its colours.
cluster_data = tsne_data.sample(5000,random_state = 123)
cluster_data.groupby(['painting_id', 'kmeans_labels']).agg('sum').head(3)
# Number of sampled paintings per cluster label for author_id 24.
cluster_data[cluster_data['author_id'] == 24]['kmeans_labels'].value_counts()
authors = pd.read_csv('data/athenaeum_authors.csv')
# Count of author rows whose first name is exactly 'Vincent'.
sum(authors.first_name == 'Vincent')
# Look up author rows by last name.
authors[authors.last_name == 'Gogh']
authors[authors.last_name == 'Monet']
# Sampled rows for author_id 789 (presumably Van Gogh's id from the author
# lookup -- confirm), reduced to the id and label columns.
van_gogh_data = cluster_data.loc[cluster_data['author_id'] == 789,
                                 ['author_id', 'painting_id' ,'kmeans_labels']]
van_gogh_data.head(5)
# Painting count per cluster label.
van_gogh_data.iloc[:, 1:].groupby(['kmeans_labels']).agg('count').reset_index().rename(columns={'painting_id':'painting_num'})
# One sub-frame per cluster label 0..6, keyed 'van_gogh_<label>'.
van_gogh_clusters = {}
for label in range(7):
    van_gogh_clusters['van_gogh_%d' % label] = van_gogh_data[van_gogh_data['kmeans_labels'] == label]
# van_gogh_clusters
for label in range(7):
    # No cluster 5 for van gogh (plot_columns only prints a notice when the
    # frame is empty).
    plot_columns(van_gogh_clusters['van_gogh_%d' % label])
van_gogh_6 = van_gogh_data[van_gogh_data['kmeans_labels'] == 6]
van_gogh_6.head(3)
# Sampled rows for author_id 13 (presumably Monet's id from the author
# lookup -- confirm), reduced to the id and label columns.
Monet_data = cluster_data.loc[cluster_data['author_id'] == 13,
                              ['author_id', 'painting_id' ,'kmeans_labels']]
Monet_data.head(3)
# Painting count per cluster label.
Monet_data.iloc[:, 1:].groupby(['kmeans_labels']).agg('count').reset_index().rename(columns={'painting_id':'painting_num'})
# One sub-frame per cluster label 0..6, keyed 'monet_<label>'.
monet_clusters = {}
for label in range(7):
    monet_clusters['monet_%d' % label] = Monet_data[Monet_data['kmeans_labels'] == label]
# monet_clusters
# Plot each cluster's paintings in label order.
for label in range(7):
    plot_columns(monet_clusters['monet_%d' % label])
cluster_data.head(2)
# Painting count per cluster label over the whole sample.
cluster_data[['painting_id', 'kmeans_labels']].groupby('kmeans_labels').agg('count').rename(columns={'painting_id': 'painting_num'})
cluster_data[cluster_data['kmeans_labels'] == 0].head(3)
# One sub-frame per cluster label 0..6, keyed 'cluster_<label>'.
paintings_clusters = {}
for label in range(7):
    paintings_clusters['cluster_%d' % label] = cluster_data[cluster_data['kmeans_labels'] == label]
# paintings_clusters
# Plot each cluster's paintings in label order.
for label in range(7):
    plot_columns(paintings_clusters['cluster_%d' % label])
# The painting_id column of the colour-histogram table.
color_hist.loc[:,'painting_id']
# NOTE(review): in this file `distances` is only assigned inside
# get_paintings_around_centroid; at module level this line presumably fails
# with NameError unless the name exists in the interactive session -- confirm.
distances.nsmallest(4).index